In [2]:
# loading packages

import os, sys, glob
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import scanpy.external as sce
import scrublet as scr
import decoupler as dc

sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, dpi_save=2000, figsize=(4, 4), color_map='OrRd', frameon=False, format='svg')
plt.rcParams['svg.fonttype'] = 'none'

%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'
In [3]:
# reading

adata_list = []
for i in ['DMSO', 'CIS', 'PAC', 'TFT', 'DOX']:
    adata = sc.read_10x_h5(f"/nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-{i}/outs/filtered_feature_bc_matrix.h5")
    adata.obs.index.name = None
    adata.var.index.name = None
    adata.var_names_make_unique()
    
    adata.obs['sample'] = i
    adata.obs.index = adata.obs['sample'] + '_' + adata.obs.index

    adata.var['mt'] = adata.var_names.str.startswith('MT-') # change
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, inplace=True)
    sce.pp.scrublet(adata) 
    
    adata_list.append(adata)
    
adata = adata_list[0].concatenate(adata_list[1:], index_unique=None)
adata.var = adata.var[['gene_ids', 'mt']]

# quality control

sc.pp.filter_cells(adata, min_counts=2000)
sc.pp.filter_cells(adata, min_genes=500)
adata = adata[adata.obs['pct_counts_mt'] < 10, :]
adata = adata[adata.obs['doublet_score'] < 0.4, :]


# processing

adata.layers['counts'] = adata.X.copy() 
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata

sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
)

sc.tl.pca(adata)
sc.pp.neighbors(adata, n_pcs=50, use_rep='X_pca')
sc.tl.umap(adata)

cell_cycle_genes = [x.strip()
                    for x in open('/home/yongjunkoh/references/regev_lab_cell_cycle_genes.txt')]
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
sc.tl.score_genes_cell_cycle(adata, s_genes, g2m_genes)
reading /nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-DMSO/outs/filtered_feature_bc_matrix.h5
 (0:00:02)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
Running Scrublet
filtered out 12485 genes that are detected in less than 3 cells
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy.
  view_to_actual(adata)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
Embedding transcriptomes using PCA...
Automatically set threshold at doublet score = 0.62
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%
    Scrublet finished (0:00:18)
reading /nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-CIS/outs/filtered_feature_bc_matrix.h5
 (0:00:01)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
Running Scrublet
filtered out 10909 genes that are detected in less than 3 cells
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy.
  view_to_actual(adata)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
Embedding transcriptomes using PCA...
Automatically set threshold at doublet score = 0.61
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 10.0%
    Scrublet finished (0:00:17)
reading /nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-PAC/outs/filtered_feature_bc_matrix.h5
 (0:00:01)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
Running Scrublet
filtered out 12389 genes that are detected in less than 3 cells
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy.
  view_to_actual(adata)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
Embedding transcriptomes using PCA...
Automatically set threshold at doublet score = 0.61
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 22.2%
    Scrublet finished (0:00:19)
reading /nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-TFT/outs/filtered_feature_bc_matrix.h5
 (0:00:01)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
Running Scrublet
filtered out 12245 genes that are detected in less than 3 cells
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy.
  view_to_actual(adata)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
Embedding transcriptomes using PCA...
Automatically set threshold at doublet score = 0.57
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 12.2%
    Scrublet finished (0:00:14)
reading /nfs/SCMGL/yjk11_bxb1/counts-GRCh38/HeLa-Bxb-DOX/outs/filtered_feature_bc_matrix.h5
 (0:00:01)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  utils.warn_names_duplicates("var")
Running Scrublet
filtered out 11512 genes that are detected in less than 3 cells
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy.
  view_to_actual(adata)
normalizing counts per cell
    finished (0:00:00)
normalizing counts per cell
    finished (0:00:00)
Embedding transcriptomes using PCA...
Automatically set threshold at doublet score = 0.59
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 8.0%
    Scrublet finished (0:00:15)
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
filtered out 4175 cells that have less than 2000 counts
filtered out 24 cells that have less than 500 genes expressed
normalizing counts per cell
    finished (0:00:00)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:08)
computing neighbors
2025-08-28 02:33:23.329965: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-28 02:33:23.605620: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-28 02:33:25.232545: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-28 02:33:25.234229: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-28 02:33:28.793140: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:45)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:27)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/tools/_score_genes.py:151: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
  for cut in np.unique(obs_cut.loc[gene_list]):
    finished: added
    'S_score', score of gene set (adata.obs).
    344 total control genes are used. (0:00:02)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
/home/yongjunkoh/.conda/envs/py38/lib/python3.8/site-packages/scanpy/tools/_score_genes.py:151: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
  for cut in np.unique(obs_cut.loc[gene_list]):
    finished: added
    'G2M_score', score of gene set (adata.obs).
    214 total control genes are used. (0:00:02)
-->     'phase', cell cycle phase (adata.obs)
In [4]:
sc.pl.umap(adata, color=['sample'])
No description has been provided for this image
In [12]:
# changing color and orders

adata.obs['sample'] = adata.obs['sample'].cat.reorder_categories(['DMSO', 'CIS', 'PAC', 'TFT', 'DOX'])
adata.uns['sample_colors'] = ['#e3e3dd', '#28b463', '#2f86c1', '#b03a2e', '#7c3b97']

sc.pl.umap(adata, color=['sample'])
No description has been provided for this image
In [11]:
# clustering

sc.tl.leiden(adata, resolution=0.5, key_added='leiden_0.5')

sc.pl.umap(adata, color=['leiden_0.5'])
running Leiden clustering
    finished: found 9 clusters and added
    'leiden_0.5', the cluster labels (adata.obs, categorical) (0:00:04)
No description has been provided for this image
In [9]:
# deg analysis

adata.uns['log1p']['base'] = None

sc.tl.rank_genes_groups(adata, 
                        groupby='leiden_0.5',
                        method='wilcoxon', 
                        key_added='leiden_0.5_wilcoxon')

with pd.ExcelWriter('hela-wilcoxon-markers_230626_yongjunkoh.xlsx') as writer:
    for i in adata.obs['leiden_0.5'].cat.categories:
        sc.get.rank_genes_groups_df(adata, group=i, key='leiden_0.5_wilcoxon').to_excel(writer, sheet_name=i)
In [10]:
# inspecting genes

sc.pl.umap(adata, color=['TYMS'])
sc.pl.umap(adata, color=['BCL2'])
sc.pl.umap(adata, color=['CDK6'])
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [13]:
# decoupler progeny (pathway analysis)

progeny = dc.get_progeny(organism='human', top=500)

dc.run_mlm(
    mat=adata,
    net=progeny,
    source='source',
    target='target',
    weight='weight',
    verbose=True
)

adata.obsm['progeny_mlm_estimate'] = adata.obsm['mlm_estimate'].copy()
adata.obsm['progeny_mlm_pvals'] = adata.obsm['mlm_pvals'].copy()

adata.obsm['progeny_mlm_estimate'] = adata.obsm['mlm_estimate'].copy()
adata.obsm['progeny_mlm_pvals'] = adata.obsm['mlm_pvals'].copy()

acts = dc.get_acts(adata, obsm_key='mlm_estimate')

sc.pl.matrixplot(acts, var_names=acts.var_names, groupby='sample', dendrogram=True, standard_scale='var', colorbar_title='Z-scaled scores', cmap='RdBu_r')
sc.pl.umap(acts, color=['p53'], cmap='RdBu_r', vcenter=0)
4835 features of mat are empty, they will be removed.
Running mlm on mat with 32952 samples and 31766 targets for 14 sources.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:56<00:00, 14.13s/it]
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
    using data matrix X directly
Storing dendrogram info using `.uns['dendrogram_sample']`
No description has been provided for this image
No description has been provided for this image
In [14]:
acts_p = acts.copy()
In [15]:
# decoupler collectri (tf analysis)

net = dc.get_collectri(organism='human', split_complexes=False)

dc.run_ulm(
    mat=adata,
    net=net,
    source='source',
    target='target',
    weight='weight',
    verbose=True
)

adata.obsm['collectri_ulm_estimate'] = adata.obsm['ulm_estimate'].copy()
adata.obsm['collectri_ulm_pvals'] = adata.obsm['ulm_pvals'].copy()

acts = dc.get_acts(adata, obsm_key='ulm_estimate')

df = dc.rank_sources_groups(acts, groupby='sample', reference='rest', method='t-test_overestim_var')
n_markers = 10
source_markers = df.groupby('group').head(n_markers).groupby('group')['names'].apply(lambda x: list(x)).to_dict()

sc.pl.matrixplot(acts, source_markers, 'sample', dendrogram=True, standard_scale='var', colorbar_title='Z-scaled scores', cmap='RdBu_r')
4835 features of mat are empty, they will be removed.
Running ulm on mat with 32952 samples and 31766 targets for 750 sources.
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
    using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_sample']`
No description has been provided for this image
In [18]:
# save
sc.settings.set_figure_params(dpi=100, dpi_save=2000, figsize=(4, 4), color_map='OrRd', frameon=False)

sc.pl.umap(adata, color=['sample'], save='_sample.svg')
sc.pl.umap(adata, color=['leiden_0.5'], save='_cluster.svg')

sc.pl.umap(adata, color=['TYMS'], save='_TYMS.svg')
sc.pl.umap(adata, color=['BCL2'], save='_BCL2.svg')
sc.pl.umap(adata, color=['CDK6'], save='_CDK6.svg')

sc.pl.umap(acts_p, color=['p53'], cmap='RdBu_r', vcenter=0, save='_p53.svg')
sc.pl.matrixplot(acts, source_markers, 'sample', dendrogram=True, standard_scale='var', colorbar_title='Z-scaled scores', cmap='RdBu_r', save='collectri.svg')
WARNING: saving figure to file figures/umap_sample.svg
No description has been provided for this image
WARNING: saving figure to file figures/umap_cluster.svg
No description has been provided for this image
WARNING: saving figure to file figures/umap_TYMS.svg
No description has been provided for this image
WARNING: saving figure to file figures/umap_BCL2.svg
No description has been provided for this image
WARNING: saving figure to file figures/umap_CDK6.svg
No description has been provided for this image
WARNING: saving figure to file figures/umap_p53.svg
No description has been provided for this image
WARNING: saving figure to file figures/matrixplot_collectri.svg
No description has been provided for this image
In [19]:
# save
sc.settings.set_figure_params(dpi=100, dpi_save=2000, figsize=(4, 4), color_map='OrRd', frameon=False)

sc.pl.umap(adata, color=['sample'], save='_sample.pdf')
sc.pl.umap(adata, color=['leiden_0.5'], save='_cluster.pdf')

sc.pl.umap(adata, color=['TYMS'], save='_TYMS.pdf')
sc.pl.umap(adata, color=['BCL2'], save='_BCL2.pdf')
sc.pl.umap(adata, color=['CDK6'], save='_CDK6.pdf')

sc.pl.umap(acts_p, color=['p53'], cmap='RdBu_r', vcenter=0, save='_p53.pdf')
sc.pl.matrixplot(acts, source_markers, 'sample', dendrogram=True, standard_scale='var', colorbar_title='Z-scaled scores', cmap='RdBu_r', save='collectri.pdf')
WARNING: saving figure to file figures/umap_sample.pdf
No description has been provided for this image
WARNING: saving figure to file figures/umap_cluster.pdf
No description has been provided for this image
WARNING: saving figure to file figures/umap_TYMS.pdf
No description has been provided for this image
WARNING: saving figure to file figures/umap_BCL2.pdf
No description has been provided for this image
WARNING: saving figure to file figures/umap_CDK6.pdf
No description has been provided for this image
WARNING: saving figure to file figures/umap_p53.pdf
No description has been provided for this image
WARNING: saving figure to file figures/matrixplot_collectri.pdf
No description has been provided for this image
In [ ]: